{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "90d43b56-97e5-45e2-8e67-4488ed31d2df",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Run PyTorchJob From Function\n",
    "\n",
    "In this Notebook we are going to create [Kubeflow PyTorchJob](https://www.kubeflow.org/docs/components/training/pytorch/).\n",
    "\n",
    "The PyTorchJob will run distributive training using [DistributedDataParallel strategy](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a8bb6564-fde3-4c28-841c-012122643dd9",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Install Kubeflow Python SDKs\n",
    "\n",
    "You need to install PyTorch packages and Kubeflow SDKs to run this Notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d49f072e-2221-48bb-9f6d-561713d1a45c",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install torch==1.12.1\n",
    "!pip install torchvision==0.13.1\n",
    "\n",
    "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n",
    "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9331a05-9127-4b3a-8077-31157e267827",
   "metadata": {},
   "source": [
    "## Create Train Script for CNN Model\n",
    "\n",
    "This is simple **Convolutional Neural Network (CNN)** model for recognizing different picture of clothing using [Fashion MNIST Dataset](https://github.com/zalandoresearch/fashion-mnist)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "69f21f33-5c64-452c-90c4-977fc0dadb3b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def train_pytorch_model():\n",
    "    import logging\n",
    "    import os\n",
    "    from torchvision import transforms, datasets\n",
    "    import torch\n",
    "    from torch import nn\n",
    "    import torch.nn.functional as F\n",
    "    import torch.distributed as dist\n",
    "\n",
    "    logging.basicConfig(\n",
    "        format=\"%(asctime)s %(levelname)-8s %(message)s\",\n",
    "        datefmt=\"%Y-%m-%dT%H:%M:%SZ\",\n",
    "        level=logging.DEBUG,\n",
    "    )\n",
    "\n",
    "    # Create PyTorch CNN Model.\n",
    "    class Net(nn.Module):\n",
    "        def __init__(self):\n",
    "            super(Net, self).__init__()\n",
    "            self.conv1 = nn.Conv2d(1, 20, 5, 1)\n",
    "            self.conv2 = nn.Conv2d(20, 50, 5, 1)\n",
    "            self.fc1 = nn.Linear(4 * 4 * 50, 500)\n",
    "            self.fc2 = nn.Linear(500, 10)\n",
    "\n",
    "        def forward(self, x):\n",
    "            x = F.relu(self.conv1(x))\n",
    "            x = F.max_pool2d(x, 2, 2)\n",
    "            x = F.relu(self.conv2(x))\n",
    "            x = F.max_pool2d(x, 2, 2)\n",
    "            x = x.view(-1, 4 * 4 * 50)\n",
    "            x = F.relu(self.fc1(x))\n",
    "            x = self.fc2(x)\n",
    "            return F.log_softmax(x, dim=1)\n",
    "\n",
    "    # Get dist parameters.\n",
    "    # Kubeflow Training Operator automatically set appropriate RANK and WORLD_SIZE based on the configuration.\n",
    "    RANK = int(os.environ[\"RANK\"])\n",
    "    WORLD_SIZE = int(os.environ[\"WORLD_SIZE\"])\n",
    "    \n",
    "    model = Net()\n",
    "    # Attach model to DistributedDataParallel strategy.\n",
    "    dist.init_process_group(backend=\"gloo\", rank=RANK, world_size=WORLD_SIZE)\n",
    "    Distributor = nn.parallel.DistributedDataParallel\n",
    "    model = Distributor(model)\n",
    "\n",
    "    # Split batch size for each worker.\n",
    "    batch_size = int(128 / WORLD_SIZE)\n",
    "\n",
    "    # Get Fashion MNIST DataSet.\n",
    "    train_loader = torch.utils.data.DataLoader(\n",
    "        datasets.FashionMNIST(\n",
    "            \"./data\",\n",
    "            train=True,\n",
    "            download=True,\n",
    "            transform=transforms.Compose([transforms.ToTensor()]),\n",
    "        ),\n",
    "        batch_size=batch_size,\n",
    "    )\n",
    "\n",
    "    # Start Training.\n",
    "    logging.info(f\"Start training for RANK: {RANK}. WORLD_SIZE: {WORLD_SIZE}\")\n",
    "    for epoch in range(1):\n",
    "        model.train()\n",
    "        optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)\n",
    "\n",
    "        for batch_idx, (data, target) in enumerate(train_loader):\n",
    "            optimizer.zero_grad()\n",
    "            output = model(data)\n",
    "            loss = F.nll_loss(output, target)\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "            if batch_idx % 10 == 0:\n",
    "                logging.info(\n",
    "                    \"Train Epoch: {} [{}/{} ({:.0f}%)]\\tloss={:.4f}\".format(\n",
    "                        epoch,\n",
    "                        batch_idx * len(data),\n",
    "                        len(train_loader.dataset),\n",
    "                        100.0 * batch_idx / len(train_loader),\n",
    "                        loss.item(),\n",
    "                    )\n",
    "                )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8cfe8739-1f94-476a-80e3-dd6e3237d9ed",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-09-01T19:32:37.813779Z",
     "iopub.status.busy": "2022-09-01T19:32:37.812759Z",
     "iopub.status.idle": "2022-09-01T19:32:37.827050Z",
     "shell.execute_reply": "2022-09-01T19:32:37.825186Z",
     "shell.execute_reply.started": "2022-09-01T19:32:37.813690Z"
    }
   },
   "source": [
    "## Run Training Locally in the Notebook\n",
    "\n",
    "We are going to download Fashion MNIST Dataset and start local training."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9e2c6fd8-d0ba-4bc6-ac90-d4cf09751ace",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-09-12T18:21:28Z INFO     Added key: store_based_barrier_key:1 to store for rank: 0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "851b228ae0324915882f834224abe134",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/26421880 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c8dde30f1c2544f69c4f51331e0156c5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/29515 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "04cb10c56f73404d997b1b31221f5b10",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/4422102 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dd74e22f50034e889c4b3f9e7fff3f0c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/5148 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Processing...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.8/site-packages/torchvision/datasets/mnist.py:502: UserWarning: The given NumPy array is not writeable, and PyTorch does not support non-writeable tensors. This means you can write to the underlying (supposedly non-writeable) NumPy array using the tensor. You may want to copy the array to protect its data or make it writeable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at  /pytorch/torch/csrc/utils/tensor_numpy.cpp:143.)\n",
      "  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)\n",
      "2022-09-12T18:31:05Z INFO     Start training for RANK: 0. WORLD_SIZE: 1\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-09-12T18:31:05Z INFO     Train Epoch: 0 [0/60000 (0%)]\tloss=2.3061\n",
      "2022-09-12T18:31:05Z INFO     Reducer buckets have been rebuilt in this iteration.\n",
      "2022-09-12T18:31:06Z INFO     Train Epoch: 0 [1280/60000 (2%)]\tloss=2.2979\n",
      "2022-09-12T18:31:07Z INFO     Train Epoch: 0 [2560/60000 (4%)]\tloss=2.2926\n",
      "2022-09-12T18:31:08Z INFO     Train Epoch: 0 [3840/60000 (6%)]\tloss=2.2796\n",
      "2022-09-12T18:31:10Z INFO     Train Epoch: 0 [5120/60000 (9%)]\tloss=2.2838\n",
      "2022-09-12T18:31:11Z INFO     Train Epoch: 0 [6400/60000 (11%)]\tloss=2.2751\n",
      "2022-09-12T18:31:12Z INFO     Train Epoch: 0 [7680/60000 (13%)]\tloss=2.2683\n",
      "2022-09-12T18:31:13Z INFO     Train Epoch: 0 [8960/60000 (15%)]\tloss=2.2443\n",
      "2022-09-12T18:31:15Z INFO     Train Epoch: 0 [10240/60000 (17%)]\tloss=2.2341\n",
      "2022-09-12T18:31:16Z INFO     Train Epoch: 0 [11520/60000 (19%)]\tloss=2.1962\n",
      "2022-09-12T18:31:17Z INFO     Train Epoch: 0 [12800/60000 (21%)]\tloss=2.1701\n",
      "2022-09-12T18:31:18Z INFO     Train Epoch: 0 [14080/60000 (23%)]\tloss=2.1368\n",
      "2022-09-12T18:31:20Z INFO     Train Epoch: 0 [15360/60000 (26%)]\tloss=2.0717\n",
      "2022-09-12T18:31:21Z INFO     Train Epoch: 0 [16640/60000 (28%)]\tloss=1.9831\n",
      "2022-09-12T18:31:22Z INFO     Train Epoch: 0 [17920/60000 (30%)]\tloss=1.8490\n",
      "2022-09-12T18:31:24Z INFO     Train Epoch: 0 [19200/60000 (32%)]\tloss=1.6720\n",
      "2022-09-12T18:31:25Z INFO     Train Epoch: 0 [20480/60000 (34%)]\tloss=1.4354\n",
      "2022-09-12T18:31:26Z INFO     Train Epoch: 0 [21760/60000 (36%)]\tloss=1.3926\n",
      "2022-09-12T18:31:28Z INFO     Train Epoch: 0 [23040/60000 (38%)]\tloss=1.2361\n",
      "2022-09-12T18:31:29Z INFO     Train Epoch: 0 [24320/60000 (41%)]\tloss=1.1674\n",
      "2022-09-12T18:31:30Z INFO     Train Epoch: 0 [25600/60000 (43%)]\tloss=0.9845\n",
      "2022-09-12T18:31:32Z INFO     Train Epoch: 0 [26880/60000 (45%)]\tloss=0.9887\n",
      "2022-09-12T18:31:33Z INFO     Train Epoch: 0 [28160/60000 (47%)]\tloss=1.0034\n",
      "2022-09-12T18:31:34Z INFO     Train Epoch: 0 [29440/60000 (49%)]\tloss=1.1126\n",
      "2022-09-12T18:31:35Z INFO     Train Epoch: 0 [30720/60000 (51%)]\tloss=0.9854\n",
      "2022-09-12T18:31:37Z INFO     Train Epoch: 0 [32000/60000 (53%)]\tloss=0.9148\n",
      "2022-09-12T18:31:38Z INFO     Train Epoch: 0 [33280/60000 (55%)]\tloss=0.8559\n",
      "2022-09-12T18:31:39Z INFO     Train Epoch: 0 [34560/60000 (58%)]\tloss=0.9737\n",
      "2022-09-12T18:31:41Z INFO     Train Epoch: 0 [35840/60000 (60%)]\tloss=0.7636\n",
      "2022-09-12T18:31:42Z INFO     Train Epoch: 0 [37120/60000 (62%)]\tloss=0.7537\n",
      "2022-09-12T18:31:43Z INFO     Train Epoch: 0 [38400/60000 (64%)]\tloss=0.7180\n",
      "2022-09-12T18:31:45Z INFO     Train Epoch: 0 [39680/60000 (66%)]\tloss=0.8250\n",
      "2022-09-12T18:31:46Z INFO     Train Epoch: 0 [40960/60000 (68%)]\tloss=0.8221\n",
      "2022-09-12T18:31:47Z INFO     Train Epoch: 0 [42240/60000 (70%)]\tloss=0.8605\n",
      "2022-09-12T18:31:49Z INFO     Train Epoch: 0 [43520/60000 (72%)]\tloss=0.7450\n",
      "2022-09-12T18:31:50Z INFO     Train Epoch: 0 [44800/60000 (75%)]\tloss=0.8031\n",
      "2022-09-12T18:31:51Z INFO     Train Epoch: 0 [46080/60000 (77%)]\tloss=0.8090\n",
      "2022-09-12T18:31:53Z INFO     Train Epoch: 0 [47360/60000 (79%)]\tloss=0.7897\n",
      "2022-09-12T18:31:54Z INFO     Train Epoch: 0 [48640/60000 (81%)]\tloss=0.8838\n",
      "2022-09-12T18:31:55Z INFO     Train Epoch: 0 [49920/60000 (83%)]\tloss=0.7967\n",
      "2022-09-12T18:31:57Z INFO     Train Epoch: 0 [51200/60000 (85%)]\tloss=0.7554\n",
      "2022-09-12T18:31:58Z INFO     Train Epoch: 0 [52480/60000 (87%)]\tloss=0.8402\n",
      "2022-09-12T18:31:59Z INFO     Train Epoch: 0 [53760/60000 (90%)]\tloss=0.7859\n",
      "2022-09-12T18:32:00Z INFO     Train Epoch: 0 [55040/60000 (92%)]\tloss=0.6342\n",
      "2022-09-12T18:32:02Z INFO     Train Epoch: 0 [56320/60000 (94%)]\tloss=0.6881\n",
      "2022-09-12T18:32:04Z INFO     Train Epoch: 0 [57600/60000 (96%)]\tloss=0.7722\n",
      "2022-09-12T18:32:05Z INFO     Train Epoch: 0 [58880/60000 (98%)]\tloss=0.7504\n"
     ]
    }
   ],
   "source": [
    "# Set dist env variables to run the above training locally on the Notebook.\n",
    "import os\n",
    "os.environ[\"RANK\"] = \"0\"\n",
    "os.environ[\"WORLD_SIZE\"] = \"1\"\n",
    "os.environ[\"MASTER_ADDR\"] = \"localhost\"\n",
    "os.environ[\"MASTER_PORT\"] = \"1234\"\n",
    "\n",
    "# Train Model locally in the Notebook.\n",
    "train_pytorch_model()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5aae47e3-be31-468e-8f38-89e1e2f1c764",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Start Distributive Training with PyTorchJob\n",
    "\n",
    "Before creating PyTorchJob, you have to create `TrainingClient()`. It uses [Kubernetes Python client](https://github.com/kubernetes-client/python) to communicate with Kubernetes API server. You can set path and context for [the kubeconfig file](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/). The default location for the kubeconfig is `~/.kube/config`.\n",
    "\n",
    "Kubeflow Training Operator automatically set the appropriate env variables (`MASTER_PORT`, `MASTER_ADDR`, `WORLD_SIZE`, `RANK`) for each PyTorchJob container."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "eb1acd34-ebcf-409b-8bb3-0225cee37110",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "PyTorchJob kubeflow-user-example-com/train-pytorch has been created\n"
     ]
    }
   ],
   "source": [
    "from kubeflow.training import TrainingClient\n",
    "\n",
    "# Start PyTorchJob Training.\n",
    "pytorchjob_name = \"train-pytorch\"\n",
    "training_client = TrainingClient()\n",
    "\n",
    "training_client.create_pytorchjob_from_func(\n",
    "    name=pytorchjob_name,\n",
    "    func=train_pytorch_model,\n",
    "    num_worker_replicas=3, # How many PyTorch Workers will be run.\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e44c3ad7-62c4-4b58-b52a-15fd8746b772",
   "metadata": {},
   "source": [
    "### Check PyTorchJob Status\n",
    "\n",
    "Use `KubeflowClient` APIs to get information about created PyTorchJob."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "4141f6c2-c38f-4972-b68a-35d150ef7485",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PyTorchJob Status: True\n"
     ]
    }
   ],
   "source": [
    "print(f\"PyTorchJob Status: {training_client.is_job_running(name=pytorchjob_name, job_kind='PyTorchJob')}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42e10587-7ac2-45bf-9c4f-d418e1585974",
   "metadata": {},
   "source": [
    "### Get PyTorchJob Pod Names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "49b53308-a19b-45e8-942f-4333e727ee48",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['train-pytorch-master-0',\n",
       " 'train-pytorch-worker-0',\n",
       " 'train-pytorch-worker-1',\n",
       " 'train-pytorch-worker-2']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_client.get_job_pod_names(pytorchjob_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b91d332d-487c-4a95-937d-26ffb6199cda",
   "metadata": {
    "execution": {
     "iopub.status.busy": "2022-09-01T20:10:25.759950Z",
     "iopub.status.idle": "2022-09-01T20:10:25.760581Z",
     "shell.execute_reply": "2022-09-01T20:10:25.760353Z",
     "shell.execute_reply.started": "2022-09-01T20:10:25.760328Z"
    },
    "tags": []
   },
   "source": [
    "### Get PyTorchJob Training Logs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "5232d542-d4bf-4c51-8b11-ad0534fb0b9d",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The logs of pod train-pytorch-master-0:\n",
      " 2023-01-12T18:55:33Z INFO     Added key: store_based_barrier_key:1 to store for rank: 0\n",
      "2023-01-12T18:55:33Z INFO     Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes.\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n",
      "100%|██████████| 26421880/26421880 [00:02<00:00, 12562567.98it/s]\n",
      "Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n",
      "Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n",
      "100%|██████████| 29515/29515 [00:00<00:00, 211170.82it/s]\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n",
      "100%|██████████| 4422102/4422102 [00:00<00:00, 4511582.77it/s]\n",
      "Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n",
      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n",
      "100%|██████████| 5148/5148 [00:00<00:00, 23675742.32it/s]\n",
      "Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
      "\n",
      "2023-01-12T18:55:39Z INFO     Start training for RANK: 0. WORLD_SIZE: 4\n",
      "2023-01-12T18:55:40Z INFO     Train Epoch: 0 [0/60000 (0%)]\tloss=2.3033\n",
      "2023-01-12T18:55:40Z INFO     Reducer buckets have been rebuilt in this iteration.\n",
      "2023-01-12T18:55:42Z INFO     Train Epoch: 0 [320/60000 (1%)]\tloss=2.3035\n",
      "2023-01-12T18:55:43Z INFO     Train Epoch: 0 [640/60000 (1%)]\tloss=2.2942\n",
      "2023-01-12T18:55:43Z INFO     Train Epoch: 0 [960/60000 (2%)]\tloss=2.2920\n",
      "2023-01-12T18:55:44Z INFO     Train Epoch: 0 [1280/60000 (2%)]\tloss=2.2875\n",
      "2023-01-12T18:55:45Z INFO     Train Epoch: 0 [1600/60000 (3%)]\tloss=2.2658\n",
      "2023-01-12T18:55:46Z INFO     Train Epoch: 0 [1920/60000 (3%)]\tloss=2.2676\n",
      "2023-01-12T18:55:46Z INFO     Train Epoch: 0 [2240/60000 (4%)]\tloss=2.2092\n",
      "2023-01-12T18:55:47Z INFO     Train Epoch: 0 [2560/60000 (4%)]\tloss=2.2292\n",
      "2023-01-12T18:55:47Z INFO     Train Epoch: 0 [2880/60000 (5%)]\tloss=2.2402\n",
      "2023-01-12T18:55:48Z INFO     Train Epoch: 0 [3200/60000 (5%)]\tloss=2.1984\n",
      "2023-01-12T18:55:48Z INFO     Train Epoch: 0 [3520/60000 (6%)]\tloss=2.1415\n",
      "2023-01-12T18:55:49Z INFO     Train Epoch: 0 [3840/60000 (6%)]\tloss=2.0092\n",
      "2023-01-12T18:55:49Z INFO     Train Epoch: 0 [4160/60000 (7%)]\tloss=1.8847\n",
      "2023-01-12T18:55:50Z INFO     Train Epoch: 0 [4480/60000 (7%)]\tloss=1.8625\n",
      "2023-01-12T18:55:51Z INFO     Train Epoch: 0 [4800/60000 (8%)]\tloss=1.5723\n",
      "2023-01-12T18:55:51Z INFO     Train Epoch: 0 [5120/60000 (9%)]\tloss=1.4135\n",
      "2023-01-12T18:55:52Z INFO     Train Epoch: 0 [5440/60000 (9%)]\tloss=1.3640\n",
      "2023-01-12T18:55:52Z INFO     Train Epoch: 0 [5760/60000 (10%)]\tloss=1.3703\n",
      "2023-01-12T18:55:53Z INFO     Train Epoch: 0 [6080/60000 (10%)]\tloss=1.1940\n",
      "2023-01-12T18:55:53Z INFO     Train Epoch: 0 [6400/60000 (11%)]\tloss=1.1059\n",
      "2023-01-12T18:55:54Z INFO     Train Epoch: 0 [6720/60000 (11%)]\tloss=1.2499\n",
      "2023-01-12T18:55:54Z INFO     Train Epoch: 0 [7040/60000 (12%)]\tloss=0.9975\n",
      "2023-01-12T18:55:55Z INFO     Train Epoch: 0 [7360/60000 (12%)]\tloss=1.0447\n",
      "2023-01-12T18:55:56Z INFO     Train Epoch: 0 [7680/60000 (13%)]\tloss=1.0539\n",
      "2023-01-12T18:55:56Z INFO     Train Epoch: 0 [8000/60000 (13%)]\tloss=1.2946\n",
      "2023-01-12T18:55:57Z INFO     Train Epoch: 0 [8320/60000 (14%)]\tloss=1.0458\n",
      "2023-01-12T18:55:57Z INFO     Train Epoch: 0 [8640/60000 (14%)]\tloss=1.1081\n",
      "2023-01-12T18:55:58Z INFO     Train Epoch: 0 [8960/60000 (15%)]\tloss=1.2158\n",
      "2023-01-12T18:56:01Z INFO     Train Epoch: 0 [9280/60000 (15%)]\tloss=0.6873\n",
      "2023-01-12T18:56:01Z INFO     Train Epoch: 0 [9600/60000 (16%)]\tloss=1.3140\n",
      "2023-01-12T18:56:02Z INFO     Train Epoch: 0 [9920/60000 (17%)]\tloss=0.9072\n",
      "2023-01-12T18:56:02Z INFO     Train Epoch: 0 [10240/60000 (17%)]\tloss=1.1416\n",
      "2023-01-12T18:56:03Z INFO     Train Epoch: 0 [10560/60000 (18%)]\tloss=1.2440\n",
      "2023-01-12T18:56:04Z INFO     Train Epoch: 0 [10880/60000 (18%)]\tloss=0.9684\n",
      "2023-01-12T18:56:04Z INFO     Train Epoch: 0 [11200/60000 (19%)]\tloss=0.7044\n",
      "2023-01-12T18:56:05Z INFO     Train Epoch: 0 [11520/60000 (19%)]\tloss=0.9956\n",
      "2023-01-12T18:56:05Z INFO     Train Epoch: 0 [11840/60000 (20%)]\tloss=1.1197\n",
      "2023-01-12T18:56:06Z INFO     Train Epoch: 0 [12160/60000 (20%)]\tloss=0.9295\n",
      "2023-01-12T18:56:06Z INFO     Train Epoch: 0 [12480/60000 (21%)]\tloss=0.7795\n",
      "2023-01-12T18:56:07Z INFO     Train Epoch: 0 [12800/60000 (21%)]\tloss=0.8194\n",
      "2023-01-12T18:56:07Z INFO     Train Epoch: 0 [13120/60000 (22%)]\tloss=1.1227\n",
      "2023-01-12T18:56:08Z INFO     Train Epoch: 0 [13440/60000 (22%)]\tloss=0.9001\n",
      "2023-01-12T18:56:08Z INFO     Train Epoch: 0 [13760/60000 (23%)]\tloss=0.9062\n",
      "2023-01-12T18:56:09Z INFO     Train Epoch: 0 [14080/60000 (23%)]\tloss=0.9513\n",
      "2023-01-12T18:56:10Z INFO     Train Epoch: 0 [14400/60000 (24%)]\tloss=0.8561\n",
      "2023-01-12T18:56:11Z INFO     Train Epoch: 0 [14720/60000 (25%)]\tloss=0.7293\n",
      "2023-01-12T18:56:12Z INFO     Train Epoch: 0 [15040/60000 (25%)]\tloss=0.8429\n",
      "2023-01-12T18:56:12Z INFO     Train Epoch: 0 [15360/60000 (26%)]\tloss=0.9922\n",
      "2023-01-12T18:56:13Z INFO     Train Epoch: 0 [15680/60000 (26%)]\tloss=0.7432\n",
      "2023-01-12T18:56:15Z INFO     Train Epoch: 0 [16000/60000 (27%)]\tloss=1.0907\n",
      "2023-01-12T18:56:16Z INFO     Train Epoch: 0 [16320/60000 (27%)]\tloss=0.5217\n",
      "2023-01-12T18:56:16Z INFO     Train Epoch: 0 [16640/60000 (28%)]\tloss=0.9695\n",
      "2023-01-12T18:56:17Z INFO     Train Epoch: 0 [16960/60000 (28%)]\tloss=0.7314\n",
      "2023-01-12T18:56:17Z INFO     Train Epoch: 0 [17280/60000 (29%)]\tloss=0.8013\n",
      "2023-01-12T18:56:18Z INFO     Train Epoch: 0 [17600/60000 (29%)]\tloss=0.6232\n",
      "2023-01-12T18:56:18Z INFO     Train Epoch: 0 [17920/60000 (30%)]\tloss=0.6004\n",
      "2023-01-12T18:56:19Z INFO     Train Epoch: 0 [18240/60000 (30%)]\tloss=1.1647\n",
      "2023-01-12T18:56:19Z INFO     Train Epoch: 0 [18560/60000 (31%)]\tloss=1.1845\n",
      "2023-01-12T18:56:20Z INFO     Train Epoch: 0 [18880/60000 (31%)]\tloss=0.7494\n",
      "2023-01-12T18:56:21Z INFO     Train Epoch: 0 [19200/60000 (32%)]\tloss=0.6017\n",
      "2023-01-12T18:56:21Z INFO     Train Epoch: 0 [19520/60000 (33%)]\tloss=0.8297\n",
      "2023-01-12T18:56:22Z INFO     Train Epoch: 0 [19840/60000 (33%)]\tloss=0.8827\n",
      "2023-01-12T18:56:22Z INFO     Train Epoch: 0 [20160/60000 (34%)]\tloss=1.1165\n",
      "2023-01-12T18:56:23Z INFO     Train Epoch: 0 [20480/60000 (34%)]\tloss=0.5660\n",
      "2023-01-12T18:56:23Z INFO     Train Epoch: 0 [20800/60000 (35%)]\tloss=0.9627\n",
      "2023-01-12T18:56:24Z INFO     Train Epoch: 0 [21120/60000 (35%)]\tloss=0.4962\n",
      "2023-01-12T18:56:24Z INFO     Train Epoch: 0 [21440/60000 (36%)]\tloss=1.0196\n",
      "2023-01-12T18:56:25Z INFO     Train Epoch: 0 [21760/60000 (36%)]\tloss=0.7316\n",
      "2023-01-12T18:56:25Z INFO     Train Epoch: 0 [22080/60000 (37%)]\tloss=0.7878\n",
      "2023-01-12T18:56:26Z INFO     Train Epoch: 0 [22400/60000 (37%)]\tloss=0.5671\n",
      "2023-01-12T18:56:27Z INFO     Train Epoch: 0 [22720/60000 (38%)]\tloss=0.6081\n",
      "2023-01-12T18:56:27Z INFO     Train Epoch: 0 [23040/60000 (38%)]\tloss=1.0035\n",
      "2023-01-12T18:56:28Z INFO     Train Epoch: 0 [23360/60000 (39%)]\tloss=0.5702\n",
      "2023-01-12T18:56:30Z INFO     Train Epoch: 0 [23680/60000 (39%)]\tloss=0.7771\n",
      "2023-01-12T18:56:31Z INFO     Train Epoch: 0 [24000/60000 (40%)]\tloss=0.9109\n",
      "2023-01-12T18:56:32Z INFO     Train Epoch: 0 [24320/60000 (41%)]\tloss=0.8138\n",
      "2023-01-12T18:56:32Z INFO     Train Epoch: 0 [24640/60000 (41%)]\tloss=0.7430\n",
      "2023-01-12T18:56:33Z INFO     Train Epoch: 0 [24960/60000 (42%)]\tloss=0.7815\n",
      "2023-01-12T18:56:33Z INFO     Train Epoch: 0 [25280/60000 (42%)]\tloss=0.5246\n",
      "2023-01-12T18:56:34Z INFO     Train Epoch: 0 [25600/60000 (43%)]\tloss=0.7377\n",
      "2023-01-12T18:56:34Z INFO     Train Epoch: 0 [25920/60000 (43%)]\tloss=0.6146\n",
      "2023-01-12T18:56:35Z INFO     Train Epoch: 0 [26240/60000 (44%)]\tloss=0.9728\n",
      "2023-01-12T18:56:35Z INFO     Train Epoch: 0 [26560/60000 (44%)]\tloss=0.7355\n",
      "2023-01-12T18:56:36Z INFO     Train Epoch: 0 [26880/60000 (45%)]\tloss=0.6064\n",
      "2023-01-12T18:56:36Z INFO     Train Epoch: 0 [27200/60000 (45%)]\tloss=1.0344\n",
      "2023-01-12T18:56:37Z INFO     Train Epoch: 0 [27520/60000 (46%)]\tloss=0.4730\n",
      "2023-01-12T18:56:38Z INFO     Train Epoch: 0 [27840/60000 (46%)]\tloss=0.7260\n",
      "2023-01-12T18:56:38Z INFO     Train Epoch: 0 [28160/60000 (47%)]\tloss=0.8061\n",
      "2023-01-12T18:56:39Z INFO     Train Epoch: 0 [28480/60000 (47%)]\tloss=0.8537\n",
      "2023-01-12T18:56:39Z INFO     Train Epoch: 0 [28800/60000 (48%)]\tloss=1.0247\n",
      "2023-01-12T18:56:40Z INFO     Train Epoch: 0 [29120/60000 (49%)]\tloss=0.6724\n",
      "2023-01-12T18:56:41Z INFO     Train Epoch: 0 [29440/60000 (49%)]\tloss=0.9595\n",
      "2023-01-12T18:56:43Z INFO     Train Epoch: 0 [29760/60000 (50%)]\tloss=0.7610\n",
      "2023-01-12T18:56:44Z INFO     Train Epoch: 0 [30080/60000 (50%)]\tloss=0.9843\n",
      "2023-01-12T18:56:45Z INFO     Train Epoch: 0 [30400/60000 (51%)]\tloss=0.6334\n",
      "2023-01-12T18:56:45Z INFO     Train Epoch: 0 [30720/60000 (51%)]\tloss=0.6374\n",
      "2023-01-12T18:56:46Z INFO     Train Epoch: 0 [31040/60000 (52%)]\tloss=0.5124\n",
      "2023-01-12T18:56:46Z INFO     Train Epoch: 0 [31360/60000 (52%)]\tloss=0.5240\n",
      "2023-01-12T18:56:47Z INFO     Train Epoch: 0 [31680/60000 (53%)]\tloss=0.6984\n",
      "2023-01-12T18:56:47Z INFO     Train Epoch: 0 [32000/60000 (53%)]\tloss=0.8143\n",
      "2023-01-12T18:56:48Z INFO     Train Epoch: 0 [32320/60000 (54%)]\tloss=0.6173\n",
      "2023-01-12T18:56:49Z INFO     Train Epoch: 0 [32640/60000 (54%)]\tloss=0.6989\n",
      "2023-01-12T18:56:49Z INFO     Train Epoch: 0 [32960/60000 (55%)]\tloss=0.6109\n",
      "2023-01-12T18:56:50Z INFO     Train Epoch: 0 [33280/60000 (55%)]\tloss=0.5810\n",
      "2023-01-12T18:56:50Z INFO     Train Epoch: 0 [33600/60000 (56%)]\tloss=0.5392\n",
      "2023-01-12T18:56:51Z INFO     Train Epoch: 0 [33920/60000 (57%)]\tloss=0.4317\n",
      "2023-01-12T18:56:51Z INFO     Train Epoch: 0 [34240/60000 (57%)]\tloss=0.4624\n",
      "2023-01-12T18:56:52Z INFO     Train Epoch: 0 [34560/60000 (58%)]\tloss=0.3868\n",
      "2023-01-12T18:56:52Z INFO     Train Epoch: 0 [34880/60000 (58%)]\tloss=0.6871\n",
      "2023-01-12T18:56:53Z INFO     Train Epoch: 0 [35200/60000 (59%)]\tloss=0.5277\n",
      "2023-01-12T18:56:54Z INFO     Train Epoch: 0 [35520/60000 (59%)]\tloss=0.5487\n",
      "2023-01-12T18:56:54Z INFO     Train Epoch: 0 [35840/60000 (60%)]\tloss=0.5509\n",
      "2023-01-12T18:56:55Z INFO     Train Epoch: 0 [36160/60000 (60%)]\tloss=0.7043\n",
      "2023-01-12T18:56:55Z INFO     Train Epoch: 0 [36480/60000 (61%)]\tloss=0.7568\n",
      "2023-01-12T18:56:56Z INFO     Train Epoch: 0 [36800/60000 (61%)]\tloss=0.6199\n",
      "2023-01-12T18:56:56Z INFO     Train Epoch: 0 [37120/60000 (62%)]\tloss=0.7296\n",
      "2023-01-12T18:56:57Z INFO     Train Epoch: 0 [37440/60000 (62%)]\tloss=0.5492\n",
      "2023-01-12T18:56:58Z INFO     Train Epoch: 0 [37760/60000 (63%)]\tloss=0.4943\n",
      "2023-01-12T18:56:59Z INFO     Train Epoch: 0 [38080/60000 (63%)]\tloss=0.8262\n",
      "2023-01-12T18:57:01Z INFO     Train Epoch: 0 [38400/60000 (64%)]\tloss=0.6767\n",
      "2023-01-12T18:57:02Z INFO     Train Epoch: 0 [38720/60000 (65%)]\tloss=0.6093\n",
      "2023-01-12T18:57:02Z INFO     Train Epoch: 0 [39040/60000 (65%)]\tloss=0.5222\n",
      "2023-01-12T18:57:03Z INFO     Train Epoch: 0 [39360/60000 (66%)]\tloss=0.4399\n",
      "2023-01-12T18:57:03Z INFO     Train Epoch: 0 [39680/60000 (66%)]\tloss=0.6005\n",
      "2023-01-12T18:57:04Z INFO     Train Epoch: 0 [40000/60000 (67%)]\tloss=0.5421\n",
      "2023-01-12T18:57:04Z INFO     Train Epoch: 0 [40320/60000 (67%)]\tloss=0.4670\n",
      "2023-01-12T18:57:05Z INFO     Train Epoch: 0 [40640/60000 (68%)]\tloss=0.2799\n",
      "2023-01-12T18:57:06Z INFO     Train Epoch: 0 [40960/60000 (68%)]\tloss=0.5594\n",
      "2023-01-12T18:57:06Z INFO     Train Epoch: 0 [41280/60000 (69%)]\tloss=0.7234\n",
      "2023-01-12T18:57:07Z INFO     Train Epoch: 0 [41600/60000 (69%)]\tloss=0.8179\n",
      "2023-01-12T18:57:08Z INFO     Train Epoch: 0 [41920/60000 (70%)]\tloss=0.5361\n",
      "2023-01-12T18:57:08Z INFO     Train Epoch: 0 [42240/60000 (70%)]\tloss=0.6700\n",
      "2023-01-12T18:57:09Z INFO     Train Epoch: 0 [42560/60000 (71%)]\tloss=0.4328\n",
      "2023-01-12T18:57:09Z INFO     Train Epoch: 0 [42880/60000 (71%)]\tloss=0.7155\n",
      "2023-01-12T18:57:10Z INFO     Train Epoch: 0 [43200/60000 (72%)]\tloss=0.6536\n",
      "2023-01-12T18:57:11Z INFO     Train Epoch: 0 [43520/60000 (73%)]\tloss=0.4034\n",
      "2023-01-12T18:57:12Z INFO     Train Epoch: 0 [43840/60000 (73%)]\tloss=0.6295\n",
      "2023-01-12T18:57:13Z INFO     Train Epoch: 0 [44160/60000 (74%)]\tloss=0.6419\n",
      "2023-01-12T18:57:15Z INFO     Train Epoch: 0 [44480/60000 (74%)]\tloss=0.4257\n",
      "2023-01-12T18:57:15Z INFO     Train Epoch: 0 [44800/60000 (75%)]\tloss=0.6005\n",
      "2023-01-12T18:57:16Z INFO     Train Epoch: 0 [45120/60000 (75%)]\tloss=0.5280\n",
      "2023-01-12T18:57:17Z INFO     Train Epoch: 0 [45440/60000 (76%)]\tloss=0.7624\n",
      "2023-01-12T18:57:17Z INFO     Train Epoch: 0 [45760/60000 (76%)]\tloss=0.4500\n",
      "2023-01-12T18:57:18Z INFO     Train Epoch: 0 [46080/60000 (77%)]\tloss=0.6136\n",
      "2023-01-12T18:57:18Z INFO     Train Epoch: 0 [46400/60000 (77%)]\tloss=0.4631\n",
      "2023-01-12T18:57:19Z INFO     Train Epoch: 0 [46720/60000 (78%)]\tloss=0.6543\n",
      "2023-01-12T18:57:19Z INFO     Train Epoch: 0 [47040/60000 (78%)]\tloss=0.3783\n",
      "2023-01-12T18:57:20Z INFO     Train Epoch: 0 [47360/60000 (79%)]\tloss=0.6068\n",
      "2023-01-12T18:57:20Z INFO     Train Epoch: 0 [47680/60000 (79%)]\tloss=0.4288\n",
      "2023-01-12T18:57:21Z INFO     Train Epoch: 0 [48000/60000 (80%)]\tloss=0.5632\n",
      "2023-01-12T18:57:22Z INFO     Train Epoch: 0 [48320/60000 (81%)]\tloss=0.5509\n",
      "2023-01-12T18:57:22Z INFO     Train Epoch: 0 [48640/60000 (81%)]\tloss=0.7985\n",
      "2023-01-12T18:57:23Z INFO     Train Epoch: 0 [48960/60000 (82%)]\tloss=0.5953\n",
      "2023-01-12T18:57:23Z INFO     Train Epoch: 0 [49280/60000 (82%)]\tloss=0.6759\n",
      "2023-01-12T18:57:24Z INFO     Train Epoch: 0 [49600/60000 (83%)]\tloss=0.3233\n",
      "2023-01-12T18:57:24Z INFO     Train Epoch: 0 [49920/60000 (83%)]\tloss=0.3583\n",
      "2023-01-12T18:57:25Z INFO     Train Epoch: 0 [50240/60000 (84%)]\tloss=0.5348\n",
      "2023-01-12T18:57:25Z INFO     Train Epoch: 0 [50560/60000 (84%)]\tloss=0.8532\n",
      "2023-01-12T18:57:26Z INFO     Train Epoch: 0 [50880/60000 (85%)]\tloss=0.4251\n",
      "2023-01-12T18:57:27Z INFO     Train Epoch: 0 [51200/60000 (85%)]\tloss=0.4953\n",
      "2023-01-12T18:57:27Z INFO     Train Epoch: 0 [51520/60000 (86%)]\tloss=0.5538\n",
      "2023-01-12T18:57:28Z INFO     Train Epoch: 0 [51840/60000 (86%)]\tloss=0.7728\n",
      "2023-01-12T18:57:29Z INFO     Train Epoch: 0 [52160/60000 (87%)]\tloss=0.4604\n",
      "2023-01-12T18:57:31Z INFO     Train Epoch: 0 [52480/60000 (87%)]\tloss=0.8828\n",
      "2023-01-12T18:57:32Z INFO     Train Epoch: 0 [52800/60000 (88%)]\tloss=0.5369\n",
      "2023-01-12T18:57:32Z INFO     Train Epoch: 0 [53120/60000 (89%)]\tloss=0.7731\n",
      "2023-01-12T18:57:33Z INFO     Train Epoch: 0 [53440/60000 (89%)]\tloss=0.6234\n",
      "2023-01-12T18:57:33Z INFO     Train Epoch: 0 [53760/60000 (90%)]\tloss=0.5501\n",
      "2023-01-12T18:57:34Z INFO     Train Epoch: 0 [54080/60000 (90%)]\tloss=0.7707\n",
      "2023-01-12T18:57:34Z INFO     Train Epoch: 0 [54400/60000 (91%)]\tloss=0.7441\n",
      "2023-01-12T18:57:35Z INFO     Train Epoch: 0 [54720/60000 (91%)]\tloss=0.5040\n",
      "2023-01-12T18:57:36Z INFO     Train Epoch: 0 [55040/60000 (92%)]\tloss=0.4233\n",
      "2023-01-12T18:57:36Z INFO     Train Epoch: 0 [55360/60000 (92%)]\tloss=0.4983\n",
      "2023-01-12T18:57:37Z INFO     Train Epoch: 0 [55680/60000 (93%)]\tloss=0.5547\n",
      "2023-01-12T18:57:37Z INFO     Train Epoch: 0 [56000/60000 (93%)]\tloss=0.7808\n",
      "2023-01-12T18:57:38Z INFO     Train Epoch: 0 [56320/60000 (94%)]\tloss=0.5937\n",
      "2023-01-12T18:57:38Z INFO     Train Epoch: 0 [56640/60000 (94%)]\tloss=0.3243\n",
      "2023-01-12T18:57:39Z INFO     Train Epoch: 0 [56960/60000 (95%)]\tloss=0.7926\n",
      "2023-01-12T18:57:39Z INFO     Train Epoch: 0 [57280/60000 (95%)]\tloss=0.5203\n",
      "2023-01-12T18:57:40Z INFO     Train Epoch: 0 [57600/60000 (96%)]\tloss=0.5806\n",
      "2023-01-12T18:57:41Z INFO     Train Epoch: 0 [57920/60000 (97%)]\tloss=0.2864\n",
      "2023-01-12T18:57:42Z INFO     Train Epoch: 0 [58240/60000 (97%)]\tloss=0.4806\n",
      "2023-01-12T18:57:43Z INFO     Train Epoch: 0 [58560/60000 (98%)]\tloss=0.5448\n",
      "2023-01-12T18:57:44Z INFO     Train Epoch: 0 [58880/60000 (98%)]\tloss=0.7353\n",
      "2023-01-12T18:57:45Z INFO     Train Epoch: 0 [59200/60000 (99%)]\tloss=0.3771\n",
      "2023-01-12T18:57:45Z INFO     Train Epoch: 0 [59520/60000 (99%)]\tloss=0.5527\n",
      "2023-01-12T18:57:46Z INFO     Train Epoch: 0 [59840/60000 (100%)]\tloss=0.5935\n",
      "\n"
     ]
    }
   ],
   "source": [
    "training_client.get_job_logs(pytorchjob_name, container=\"pytorch\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17b0ca43-1936-4708-b03b-3ab9ac2bbdea",
   "metadata": {},
   "source": [
    "## Delete PyTorchJob\n",
    "\n",
    "Once the PyTorchJob has finished, you can delete the resource."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "32ae88fd-5b5d-4ba1-a560-9a35c5ac17de",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "PyTorchJob kubeflow-user-example-com/train-pytorch has been deleted\n"
     ]
    }
   ],
   "source": [
    "training_client.delete_pytorchjob(pytorchjob_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9641e9f-551d-44d5-872b-002fffaedcef",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
